                            widana_format
                            =============

## Date: 2013-03-04 20:59:41 PST


## Directory and file settings
## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  CAVR2009=
  CAVR2010=
  SSFILE=
  DIRECT=
  SSDIRECT=
  ZIPDIRECT=

  set -x

## Format 2009 CAVR
## ~~~~~~~~~~~~~~~~~
  ######################
  # convert tsv to csv #
  ######################

  # replace tab with comma # remove alternate delimiter character "|"
  sed -e 's/\t/,/g' -e 's/|//g' "$CAVR2009" > $DIRECT/cavr2008_unformatted.csv

  ################################################################################
  # extract identification fields, concatenate, and format for SS matching and R #
  ################################################################################

  # skip header, print identification fields with alternate delimiter "|" within id field and commas between them # remove spaces # replace empty records with NA
  awk -F, 'NR!=1 {print $1,$2,"|"$3"|"$4"|","|"$15"|"$10"|"$9"|"$12"|"$7"|"$8"|"$14"|",$3, $17, $16;}' OFS=',' $DIRECT/cavr2008_unformatted.csv |
  sed -e 's/ //g' -e ':repeat; s/,,/,NA,/; t repeat' > $DIRECT/cavr_id.csv

  ##########################
  # format DOB to YYYYMMDD #
  ##########################

  # print dob field # reorder digits
  awk -F, 'NR!=1 {print $27}' OFS=',' $DIRECT/cavr2008_unformatted.csv | sed 's/^\(..\)\/\(..\)\/\(....\)$/\3\1\2/' > $DIRECT/cavr_dob.csv

  # paste together cavr_id and cavr_dob # insert dob into pid
  paste -d\, $DIRECT/cavr_id.csv $DIRECT/cavr_dob.csv | awk -F, '{print $1, $2, "|" $8 $3, $4, $5, $6, $7;}' OFS=, > $DIRECT/cavr_pid.csv

  rm $DIRECT/cavr_id.csv
  rm $DIRECT/cavr_dob.csv

  #^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  #
  # cavr_pid.csv fields:
  #
  # $1= Locality Code      $2= Registrant ID       $3= PID                 $4= HID
  #
  # $5= Last Name          $6= Zip                 $7= State
  #
  # PID = |27Date_of_Birth|3Last_Name|4First_Name|
  # HID = |15City|10St.Name|9St.Dir.Pre|12St.Dir.Sff|7Addr.Num.|8Addr.Num.Suff.|14Unit_Number|
  #
  #^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


  ########################################################
  # calculate Julian since 19001231 (Julian day 2415385) #
  ########################################################

  # print dob # reorder digits # remove zeros
  awk -F, 'NR!=1 {print $27}' OFS=',' $DIRECT/cavr2008_unformatted.csv | sed 's/^\(..\)\/\(..\)\/\(....\)$/\3\,\1\,\2\,/' | awk -F, '{print $1 + 0, $2 + 0, $3 + 0}' OFS=, > $DIRECT/cavr_dob_b2.csv


  # calculate Julian days
  # bash floors results automatically

  # turn off tracking
  set +x

  # will append to file, remove possible existing
  rm $DIRECT/cavr_dob_b1.csv

  # loop, three columns, set y m d for use in julian day algorithm, compile line by line
  while IFS=, read -r y m d; do
  dobj=$(($d-32075+(1461*($y+4800+($m-14)/12))/4+(367*($m-2-12*(($m-14)/12)))/12-(3*(($y+4900+($m-14)/12)/100))/4-2415385))
  echo $dobj >> $DIRECT/cavr_dob_b1.csv
  done < $DIRECT/cavr_dob_b2.csv

  # turn on tracking
  set -x

  # remove erroneous "\n"
  sed 's/\n//' $DIRECT/cavr_dob_b1.csv > $DIRECT/cavr_dob_julian.csv

  rm $DIRECT/cavr_dob_b2.csv
  rm $DIRECT/cavr_dob_b1.csv

  #^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  # cavr_dob_julian.csv field:
  # $1 = Date of Birth (in Julian days since December 31, 1900)
  #^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

  # create dummy dod column
  awk -F, 'NR!=1{print 999999;}' OFS=, $DIRECT/cavr2008_unformatted.csv > $DIRECT/cavr_dod.csv

  #^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  # cavr_dod.csv field:
  # $1 = Date of Death (in Julian days since December 31, 1900)
  #^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

  ##############################
  # format regdate to YYYYMMDD #
  ##############################

  # print regdate # reorder digits
  awk -F, 'NR!=1 {print $32}' OFS=',' $DIRECT/cavr2008_unformatted.csv | sed 's/^\(..\)\/\(..\)\/\(....\)$/\3\1\2/' > $DIRECT/cavr_regdate.csv

  #^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  # cavr_regdate.csv field:
  # $1 = registration date (YYYYMMDD)
  #^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


  ################################################################################
  # extract  strat fields, format for R, replace NA with M/F when Mr Ms Mrs Miss #
  ################################################################################

  # print stratification fields # remove spaces, replace tab with comma (unnecessary..?) # impute Mrs # impute Ms # impute Miss # impute Mr # remove dead wood fields
  awk -F, 'NR!=1 {print $29,$28,$56,$36,$33,$34}' OFS=',' $DIRECT/cavr2008_unformatted.csv | sed -e 's/ //g' -e 's/\t//g' -e ':repeat; s/,,/,NA,/; t repeat' | awk -F ',' '$3 == "MRS" && $2 == "NA" {$2="F"}1'  OFS=',' | awk -F ',' '$3 == "MS" && $2 == "NA"{$2="F"}1'  OFS=',' | awk -F ',' '$3 == "MISS" && $2 == "NA"{$2="F"}1'  OFS=',' | awk -F ',' '$3 == "MR" && $2 == "NA"{$2="M"}1'  OFS=',' | cut -d\, -f1,2,4-6 > $DIRECT/cavr_strat.csv

  #^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  # cavr_strat.csv fields:
  # $1= Party             $2= Gender (incld. Name Prefix) $3= Assistance Flag
  # $4= Precinct  $5= Precinct Part
  #^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


  #################################################
  # create binary for most recent eight elections #
  #################################################

  # extract votes; and concatenate into one field for matching
  awk -F, 'NR!=1 {print $58 $59 $60 $61 $62 $63 $64 $65}' $DIRECT/cavr2008_unformatted.csv > $DIRECT/cavr_votes1.csv


  # 0=no vote, 1=vote for PG8 DP8 PP8 GG6 GP6 SS5 PG4 PP4 (descending chronological order); print to separate fields
  awk -F, '{print  "NA", $1~/PG8/, $1~/DP8/, $1~/PP8/, $1~/GG6/, $1~/GP6/, $1~/SS5/, $1~/PG4/, $1~/PP4/}'  OFS=, $DIRECT/cavr_votes1.csv > $DIRECT/cavr_votes.csv

  rm $DIRECT/cavr_votes1.csv

  #^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  # cavr_votes.csv fields:
  # $1=   CAVR2011 votes ("NA")   $2= PG8         $3= DP8         $4= PP8
  # $5=  GG6              $6= GP6 $7= SS5         $8= PG4         $9= PP4
  #^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

  ##################################################
  # finished with cavr2008_unformatted.csv, delete #

  rm $DIRECT/cavr2008_unformatted.csv


  ##################
  # combine blocks #
  ##################


  # paste blocks together
  paste -d\, $DIRECT/cavr_pid.csv $DIRECT/cavr_votes.csv $DIRECT/cavr_dob_julian.csv $DIRECT/cavr_dod.csv $DIRECT/cavr_strat.csv $DIRECT/cavr_regdate.csv > $DIRECT/cavr2008_formatted1.csv


  # change all characters to uppercase # remove alternate delimiter characters
  tr '[:lower:]' '[:upper:]' < $DIRECT/cavr2008_formatted1.csv | sed -e 's/_//g' -e 's/://g'  > $DIRECT/cavr_2008.csv

  rm $DIRECT/cavr2008_formatted1.csv


  # finished with blocks, delete all
  rm $DIRECT/cavr_pid.csv
  rm $DIRECT/cavr_votes.csv
  rm $DIRECT/cavr_dob_julian.csv
  rm $DIRECT/cavr_dod.csv
  rm $DIRECT/cavr_strat.csv
  rm $DIRECT/cavr_regdate.csv




  ###########################################
  # create PID multiple filter for CAVR2009 #
  ###########################################

  # create two column matrix, first column pid, second number of occurrences # save records of occupancy >= 2
  awk -F, '{pid[$3]++;}END{for (i in pid) print i, pid[i] ","}' OFS=, $DIRECT/cavr_2008.csv | awk -F, '$2>=2' OFS=, > $DIRECT/cavr2008_PIDmultiplefilter.csv

  ###########################################
  # create HID multiple filter for CAVR2009 #
  ###########################################

  # create two column matrix, first column hid, second occupancy # save records of occupancy >= 7
  awk -F, '{hid[$4]++;}END{for (i in hid) print i, hid[i] ","}' OFS=, $DIRECT/cavr_2008.csv | awk -F, '$2>=7' OFS=, > $DIRECT/cavr2008_HIDmultiplefilter.csv


## Match SSDMF and 2009 CAVR
## ~~~~~~~~~~~~~~~~~~~~~~~~~~
  # associate pid and date of death # if pid match, replace dod column (18) with date of death, if no match, keep 999999. all multiple pids are removed later, along with the records of all household members
  awk -F, 'NR==FNR{a[$1]=$5;next}a[$3]{$18=a[$3]}1' OFS=, $SSFILE $DIRECT/cavr_2008.csv > $DIRECT/cavr_2008_with_ssdi.csv

  rm $DIRECT/cavr_2008.csv

  # ssdmf filter is included in its format script

## Format 2010 CAVR
## ~~~~~~~~~~~~~~~~~
  # convert tsv to csv # remove any pipe characters to allow for use as alternate delimiter
  sed -e 's/\t/,/g' -e 's/|//g' "$CAVR2010" > $DIRECT/cavr2010_unformatted.csv


  #########################################################
  # extract identification fields and format for matching #
  #########################################################

  # print last name and first name # remove spaces and replace empty fields with NA (not applicable but still kept in the code)
  awk -F, 'NR!=1 {print "|"$3"|"$4"|";}' OFS=',' $DIRECT/cavr2010_unformatted.csv | sed -e 's/ //g' -e ':repeat; s/,,/,NA,/; t repeat' > $DIRECT/cavr_id.csv


  # print formatted date (yyyymmdd) to cavr_dob.csv
  awk -F, 'NR!=1 {print $27}' OFS=',' $DIRECT/cavr2010_unformatted.csv | sed 's/^\(..\)\/\(..\)\/\(....\)$/\3\1\2/' > $DIRECT/cavr_dob.csv


  # combine cavr_id.csv and cavr_dob.csv
  paste -d\, $DIRECT/cavr_id.csv $DIRECT/cavr_dob.csv > $DIRECT/cavr_pid1.csv

  rm $DIRECT/cavr_id.csv
  rm $DIRECT/cavr_dob.csv


  # add Gregorian DOB to PID
  awk -F, '{print "|"$2 $1;}' OFS=, $DIRECT/cavr_pid1.csv > $DIRECT/cavr_pid.csv

  rm $DIRECT/cavr_pid1.csv

  #####################################
  # create binary for GG10, GP10, SS9 #
  #####################################


  # extract votes; and concatenate into one field for matching ##
  awk -F, 'NR!=1 {print $66 $67 $68 $69 $70 $71 $72 $73}' $DIRECT/cavr2010_unformatted.csv > $DIRECT/cavr_votes1.csv

  rm $DIRECT/cavr2010_unformatted.csv


  # 0=no vote, 1=vote for GG10 GP10 SS9; and print to separate fields ##
  awk -F, '{print  $1~/GG10/, $1~/GP10/, $1~/SS9/}'  OFS=, $DIRECT/cavr_votes1.csv > $DIRECT/cavr_votes.csv

  rm $DIRECT/cavr_votes1.csv


  ##################
  # combine blocks #
  ##################

  # paste together pid and 2010 votes
  paste -d\, $DIRECT/cavr_pid.csv $DIRECT/cavr_votes.csv > $DIRECT/cavr2010_formatted1.csv

  rm $DIRECT/cavr_pid.csv
  rm $DIRECT/cavr_votes.csv


  # change all characters to uppercase and remove _ : to allow use of alternate delimiters
  tr '[:lower:]' '[:upper:]' < $DIRECT/cavr2010_formatted1.csv | sed -e 's/_//g' -e 's/://g'  > $DIRECT/cavr_2010.csv

  rm $DIRECT/cavr2010_formatted1.csv



  ###########################################
  # create PID multiple filter for CAVR2010 #
  ###########################################

  # create two column matrix, first column pid, second number of occurrences # save records of occupancy >= 2
  awk -F, '{pid[$1]++;}END{for (i in pid) print i, pid[i] ","}' OFS=, $DIRECT/cavr_2010.csv | awk -F, '$2>=2' OFS=, > $DIRECT/cavr2010_PIDmultiplefilter.csv

## Match 2010 CAVR to 2009 CAVR
## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  # associate pid with GG10, GP10 and SS9 votes # if pid match, replace column eight with GG10, GP10, and SS9 votes, if no match replace column eight with NA, NA, NA
  awk -F, 'NR==FNR{votes2011[$1]=$2 "," $3 "," $4; next}{if ($3 in votes2011) {print $1, $2, $3, $4, $5, $6, $7, votes2011[$3], $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24;} else {print $1, $2, $3, $4, $5, $6, $7, "NA", "NA", "NA", $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24;}}' OFS=, $DIRECT/cavr_2010.csv $DIRECT/cavr_2008_with_ssdi.csv > $DIRECT/cavr_2008_with_ssdi2010.csv

  rm $DIRECT/cavr_2008_with_ssdi.csv
  rm $DIRECT/cavr_2010.csv

## Insert ages and date of death variables
## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

## Age
## ====
  # 19010101 is set to Julian day 1 (Julian day minus 2415385)

  # create filter for records with birth dates before January 1, 1901 (true birthdates in 1900 are indistinguisable from DOB year NAs)
  awk -F, '$19<=0' OFS=, $DIRECT/cavr_2008_with_ssdi2010.csv | awk -F, '{print $3, $19, $20}' OFS=, > $DIRECT/cavrDOB_PIDmultiplefilter.csv

  # print dob and dod to separate csv # change NA to -1 so that it will be apparent in the code (a negative lifespan)
  awk -F, '{print $19, $20}' OFS=, $DIRECT/cavr_2008_with_ssdi2010.csv | sed 's/999999/-1/g' > $DIRECT/cavr2008_2010_dobconvert.csv

  rm $DIRECT/cavr_2008_with_ssdi2010_noDOB_NA.csv

  ###################################
  # calculate age for each election #
  ###################################


  # calculate Julian days since December 31, 1900 for each election date
  . ./juliandays_elections.bash


  set +x

  rm $DIRECT/cavr_dob_b2.csv

  # subtract dob from election date, divide by 365.25 to get age on election day
  # subtract dob from dod, divide by 365.25 to get age at death
  while IFS=, read -r dob dod; do

  age_gg10_e=$((($gg10jd-$dob)*4/1461))
  age_gp10_e=$((($gp10jd-$dob)*4/1461))
  age_ss9_e=$((($ss9jd-$dob)*4/1461))
  age_pg8_e=$((($pg8jd-$dob)*4/1461))
  age_dp8_e=$((($dp8jd-$dob)*4/1461))
  age_pp8_e=$((($pp8jd-$dob)*4/1461))
  age_gg6_e=$((($gg6jd-$dob)*4/1461))
  age_gp6_e=$((($gp6jd-$dob)*4/1461))
  age_ss5_e=$((($ss5jd-$dob)*4/1461))
  age_pg4_e=$((($pg4jd-$dob)*4/1461))
  age_pp4_e=$((($pp4jd-$dob)*4/1461))
  ageatdeath_e=$((($dod-$dob)))

  echo $age_gg10_e","$age_gp10_e","$age_ss9_e","$age_pg8_e","$age_dp8_e","$age_pp8_e","$age_gg6_e","$age_gp6_e","$age_ss5_e","$age_pg4_e","$age_pp4_e","$ageatdeath_e"," >> $DIRECT/cavr_dob_b2.csv

  done < $DIRECT/cavr2008_2010_dobconvert.csv

  set -x

  # remove erroneous "\n" # insert 99999 into ageatdeath column for dod NA
  sed 's/\n//' $DIRECT/cavr_dob_b2.csv | awk -F, '$12<=0{$12=999999}1' OFS=, > $DIRECT/cavr2008_2010_agesinyears.csv

  rm $DIRECT/cavr_dob_b2.csv
  rm $DIRECT/cavr2008_2010_dobconvert.csv

## Add ages to cavr file
## ======================
  # add ages to cavr file
  paste -d\, $DIRECT/cavr_2008_with_ssdi2010.csv $DIRECT/cavr2008_2010_agesinyears.csv > $DIRECT/cavr2008_2010_withages.csv

  rm $DIRECT/cavr2008_2010_agesinyears.csv
  rm $DIRECT/cavr_2008_with_ssdi2010.csv

## Reorder
## ========
  ###########################################
  # reorder and insert NA for DOD variables #
  ###########################################

  awk -F, '{print $3, $4, $5, $6, $7, $1, $24, $25, $26, $19, $20, $38, $22, $21, $8, $27, "NA", $9, $28, "NA", $10, $29, "NA", $11, $30, "NA", $12, $31, "NA", $13, $32, "NA", $14, $33, "NA", $15, $34, "NA", $16, $35, "NA", $17, $36, "NA", $18, $37, "NA"}' OFS=, $DIRECT/cavr2008_2010_withages.csv > $DIRECT/cavr2008_2010_formatted.csv

  rm $DIRECT/cavr2008_2010_withages.csv

## Insert date of death variables
## ===============================
  #######################################
  # calculate days from death variables #
  #######################################

  # calculate Julian days since December 31, 1900 for each election date
  . ./juliandays_elections.bash


  # insert days from death data for records with observed death # replace days from death NAs with 999999
  awk -F, -v gg10jd=$gg10jd -v gp10jd=$gp10jd -v ss9jd=$ss9jd -v pg8jd=$pg8jd -v pp8jd=$pp8jd -v dp8jd=$dp8jd -v gg6jd=$gg6jd -v gp6jd=$gp6jd -v ss5jd=$ss5jd -v pg4jd=$pg4jd -v pp4jd=$pp4jd '$11!="999999"{egodod=$11; $17=gg10jd-egodod; $20=gp10jd-egodod; $23=ss9jd-egodod; $26=pg8jd-egodod; $29=dp8jd-egodod; $32=pp8jd-egodod; $35=gg6jd-egodod; $38=gp6jd-egodod; $41=ss5jd-egodod; $44=pg4jd-egodod; $47=pp4jd-egodod}1' OFS=, $DIRECT/cavr2008_2010_formatted.csv | awk -F, '$11=="999999"{$17=999999; $20=999999; $23=999999; $26=999999; $29=999999; $32=999999; $35=999999; $38=999999; $41=999999; $44=999999; $47=999999}1' OFS=, > $DIRECT/cavr2008_2010_formatted_withdoddata.csv

  rm $DIRECT/cavr2008_2010_formatted.csv

  # add R-friendly variables
  awk -F, '$11!=999999{print $0, $13~/M/, $13~/F/, $14~/DEM/, $14~/REP/, 1}$11==999999{print $0, $13~/M/, $13~/F/, $14~/DEM/, $14~/REP/, 0}' OFS=, $DIRECT/cavr2008_2010_formatted_withdoddata.csv > $DIRECT/cavr_egodata1.csv

  rm $DIRECT/cavr2008_2010_formatted_withdoddata.csv

## Verification 1
## ~~~~~~~~~~~~~~~
  # verify that pg8_e is binary # verify that gender_e is F, M, or NA
  awk -F, '$24==0||$24==1' OFS=, $DIRECT/cavr_egodata1.csv | awk -F, '$13=="F"||$13=="M"||$13=="NA"' OFS=, > $DIRECT/cavr_egodata.csv

  awk -F, '$24!=0&&$24!=1' OFS=, $DIRECT/cavr_egodata1.csv | awk -F, '$13!="F"&&$13!="M"&&$13!="NA"' OFS=, > $DIRECT/cavr_egodata_removed.csv

  rm $DIRECT/cavr_egodata1.csv


  ### print stats

  awk -F, 'END{print "Number of voters in cavr_egodata: " NR "\n"}' OFS=, $DIRECT/cavr_egodata.csv > $DIRECT/cavr_format_stats.txt

  awk -F, '$15=="NA"' OFS=, $DIRECT/cavr_egodata.csv | awk 'END{print "Number of voters in cavr 2009 not in cavr 2010: " NR "\n"}' OFS=, >> $DIRECT/cavr_format_stats.txt

## Match households
## ~~~~~~~~~~~~~~~~~
  # reorder by HID, insert alternate delimiters ":" "_" to allow for blocking/splitting off of pid and descriptor variables
  awk -F, '{print $2,$1 "_" $3 ":" $4 ":" $5 ":" $6 ":" $7 ":" $8 ":" $9 ":" $10 ":" $11 ":" $12 ":" $13 ":" $14 ":" $15 ":" $16 ":" $17 ":" $18 ":" $19 ":" $20 ":" $21 ":" $22 ":" $23 ":" $24 ":" $25 ":" $26 ":" $27 ":" $28 ":" $29 ":" $30 ":" $31 ":" $32 ":" $33 ":" $34 ":" $35 ":" $36 ":" $37 ":" $38 ":" $39 ":" $40 ":" $41 ":" $42 ":" $43 ":" $44 ":" $45 ":" $46 ":" $47 ":" $48 ":" $49 ":" $50 ":" $51 ":" $52 ","}' OFS=, $DIRECT/cavr_egodata.csv > $DIRECT/cavr_byhid.csv

  # removes all households of 7 or more occupants (by the filter created earlier, not by the number of alters)
  awk -F, 'NR==FNR{hid[$1]; next}!($1 in hid)' OFS=, $DIRECT/cavr2008_HIDmultiplefilter.csv $DIRECT/cavr_byhid.csv > $DIRECT/cavr_byhid_under7occ.csv

  rm $DIRECT/cavr_byhid.csv
  #rm $DIRECT/cavr2008_HIDmultiplefilter.csv

  # match PIDs to HIDs, creates household file which includes all household members
  awk -F, '{a[$1]=($1 in a)?a[$1] "," $2:$2}END{for(i in a) print i , a[i]}' OFS=, $DIRECT/cavr_byhid_under7occ.csv > $DIRECT/cavr_households.csv

  rm $DIRECT/cavr_byhid_under7occ.csv

  # add empty fields where relevant # insert NA_NA for empty PID_descript field
  awk -F, '{print $1, $2, $3, $4, $5, $6, $7, $8 ","}' OFS=, $DIRECT/cavr_households.csv | sed ':repeat; s/,,/,NA_NA,/; t repeat' | awk -F, '$3!="NA_NA"' OFS=, > $DIRECT/cavr_households_withNAs.csv

  rm $DIRECT/cavr_households.csv

  # print stats

  awk -F, 'END{print "Total number of households with at least two registered voters: " NR "\n"}' OFS=, $DIRECT/cavr_households_withNAs.csv >> $DIRECT/cavr_format_stats.txt


## Filter duplicates and indistinguishable PIDs
## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  # separate pids and descriptor variables
  sed -e 's/_/,/g' $DIRECT/cavr_households_withNAs.csv > $DIRECT/cavr_forpidfilters.csv

  rm $DIRECT/cavr_households_withNAs.csv

  # removes all households with an occupant matching any duplicate pid in the 2009 california voter record
  awk -F, 'NR==FNR{pid[$1]; next}!($2 in pid)&&!($4 in pid)&&!($6 in pid)&&!($8 in pid)&&!($10 in pid)&&!($12 in pid)&&!($14 in pid)' OFS=, $DIRECT/cavr2008_PIDmultiplefilter.csv $DIRECT/cavr_forpidfilters.csv > $DIRECT/cavr_filtered_2008.csv

  #rm $DIRECT/cavr2008_PIDmultiplefilter.csv
  rm $DIRECT/cavr_forpidfilters.csv

  # removes all households with an occupant matching any duplicate pid in the 2010 california voter record
  awk -F, 'NR==FNR{pid[$1]; next}!($2 in pid)&&!($4 in pid)&&!($6 in pid)&&!($8 in pid)&&!($10 in pid)&&!($12 in pid)&&!($14 in pid)' OFS=, $DIRECT/cavr2010_PIDmultiplefilter.csv $DIRECT/cavr_filtered_2008.csv > $DIRECT/cavr_filtered_2008_2010.csv

  #rm $DIRECT/cavr2010_PIDmultiplefilter.csv
  rm $DIRECT/cavr_filtered_2008.csv

  # removes all households with an occupant matching any duplicate pid in the social security death index
  awk -F, 'NR==FNR{pid[$1]; next}!($2 in pid)&&!($4 in pid)&&!($6 in pid)&&!($8 in pid)&&!($10 in pid)&&!($12 in pid)&&!($14 in pid)' OFS=, $SSDIRECT/ssdmf_full_PIDmultiplefilter.csv $DIRECT/cavr_filtered_2008_2010.csv > $DIRECT/cavr_filtered_2008_2010_ssdi.csv

  #rm $DIRECT/ssdm_CAonly_PIDmultiplefilter.csv
  rm $DIRECT/cavr_filtered_2008_2010.csv

  # print stats

  awk -F, 'END{print "Households after removing duplicate PIDs: " NR "\n"}' OFS=, $DIRECT/cavr_filtered_2008_2010_ssdi.csv >> $DIRECT/cavr_format_stats.txt

  # removes all households with an occupant whose DOB is NA
  awk -F, 'NR==FNR{pid[$1]; next}!($2 in pid)&&!($4 in pid)&&!($6 in pid)&&!($8 in pid)&&!($10 in pid)&&!($12 in pid)&&!($14 in pid)' OFS=, $DIRECT/cavrDOB_PIDmultiplefilter.csv $DIRECT/cavr_filtered_2008_2010_ssdi.csv > $DIRECT/cavr_filtered_2008_2010_ssdi_DOB.csv

  rm $DIRECT/cavr_filtered_2008_2010_ssdi.csv

  awk -F, 'END{print "Households after removing unavailable DOBs: " NR "\n"}' OFS=, $DIRECT/cavr_filtered_2008_2010_ssdi_DOB.csv >> $DIRECT/cavr_format_stats.txt


  # re-block pid and descriptor variables
  awk -F, '$4=="NA" && $6=="NA" && $8=="NA" && $10=="NA" && $12=="NA" && $14=="NA" {print $1, $2 "_" $3, "NA" ",""NA" ",""NA" ",""NA" ",""NA" ",""NA" ","} $4!="NA" && $6=="NA" && $8=="NA" && $10=="NA" && $12=="NA" && $14=="NA" {print $1, $2 "_" $3, $4 "_" $5, "NA" ",""NA" ",""NA" ",""NA" ",""NA" ","} $4!="NA" && $6!="NA" && $8=="NA" && $10=="NA" && $12=="NA" && $14=="NA" {print $1, $2 "_" $3, $4 "_" $5, $6 "_" $7, "NA" ",""NA" ",""NA" ",""NA" ","} $4!="NA" && $6!="NA" && $8!="NA" && $10=="NA" && $12=="NA" && $14=="NA" {print $1, $2 "_" $3, $4 "_" $5, $6 "_" $7, $8 "_" $9, "NA" ",""NA" ",""NA" ","} $4!="NA" && $6!="NA" && $8!="NA" && $10!="NA" && $12=="NA" && $14=="NA" {print $1, $2 "_" $3, $4 "_" $5, $6 "_" $7, $8 "_" $9, $10 "_" $11, "NA" ",""NA" ","} $4!="NA" && $6!="NA" && $8!="NA" && $10!="NA" && $12!="NA" && $14=="NA" {print $1, $2 "_" $3, $4 "_" $5, $6 "_" $7, $8 "_" $9, $10 "_" $11, $12 "_" $13, "NA" ","} $4!="NA" && $6!="NA" && $8!="NA" && $10!="NA" && $12!="NA" && $14!="NA" {print $1, $2 "_" $3, $4 "_" $5, $6 "_" $7, $8 "_" $9, $10 "_" $11, $12 "_" $13, $14 "_" $15 ","} ' OFS=, $DIRECT/cavr_filtered_2008_2010_ssdi_DOB.csv > $DIRECT/cavr_byhid_filtered.csv

  rm $DIRECT/cavr_filtered_2008_2010_ssdi_DOB.csv


## Arrange in dyads
## ~~~~~~~~~~~~~~~~~
  ###################################################################
  # ego-alter file for households of up to six registered occupants #
  ###################################################################

  # re-order by pids (creates a record for each occupant in household) and insert the household occupancy count (by number of alters plus one)
  awk -F, '$3=="NA" && $4=="NA" && $5=="NA" && $6=="NA" && $7=="NA" && $8=="NA" {print $2, $1 "_" 1, $3, $4, $5, $6, $7, $8 ","} $3!="NA" && $4=="NA" && $5=="NA" && $6=="NA" && $7=="NA" && $8=="NA" {print $2, $1 "_" 2, $3, $4, $5, $6, $7, $8 "\n" $3, $1 "_" 2, $2, $4, $5, $6, $7, $8 ","} $3!="NA" && $4!="NA" && $5=="NA" && $6=="NA" && $7=="NA" && $8=="NA" {print $2, $1 "_" 3, $3, $4, $5, $6, $7, $8 "\n" $3, $1 "_" 3, $2, $4, $5, $6, $7, $8 "\n" $4, $1 "_" 3, $2, $3, $5, $6, $7, $8 ","} $3!="NA" && $4!="NA" && $5!="NA" && $6=="NA" && $7=="NA" && $8=="NA" {print $2, $1 "_" 4, $3, $4, $5, $6, $7, $8 "\n" $3, $1 "_" 4, $2, $4, $5, $6, $7, $8 "\n" $4, $1 "_" 4, $2, $3, $5, $6, $7, $8 "\n" $5, $1 "_" 4, $2, $3, $4, $6, $7, $8 ","} $3!="NA" && $4!="NA" && $5!="NA" && $6!="NA" && $7=="NA" && $8=="NA" {print $2, $1 "_" 5, $3, $4, $5, $6, $7, $8 "\n" $3, $1 "_" 5, $2, $4, $5, $6, $7, $8 "\n" $4, $1 "_" 5, $2, $3, $5, $6, $7, $8 "\n" $5, $1 "_" 5, $2, $3, $4, $6, $7, $8 "\n" $6, $1 "_" 5, $2, $3, $4, $5, $7, $8 ","} $3!="NA" && $4!="NA" && $5!="NA" && $6!="NA" && $7!="NA" && $8=="NA" {print $2, $1 "_" 6, $3, $4, $5, $6, $7, $8, "\n" $3, $1 "_" 6, $2, $4, $5, $6, $7, $8, "\n" $4, $1 "_" 6, $2, $3, $5, $6, $7, $8, "\n" $5, $1 "_" 6, $2, $3, $4, $6, $7, $8, "\n" $6, $1 "_" 6, $2, $3, $4, $5, $7, $8, "\n"  $7, $1 "_" 6, $2, $3, $4, $5, $6, $8 ","}' OFS=, $DIRECT/cavr_byhid_filtered.csv > $DIRECT/cavr2008_2010_byEgo.csv

  rm $DIRECT/cavr_byhid_filtered.csv

  #########################
  # ego-alter file, dyads #
  #########################

  # print all permutations
  awk -F, '$3=="NA" && $4=="NA" && $5=="NA" && $6=="NA" && $7=="NA" && $8=="NA" {print $1, $2, "NA" ","} $3!="NA" && $4=="NA" && $5=="NA" && $6=="NA" && $7=="NA" && $8=="NA" {print $1, $2, $3 ","} $3!="NA" && $4!="NA" && $5=="NA" && $6=="NA" && $7=="NA" && $8=="NA" {print $1, $2, $3, "\n" $1, $2, $4 ","} $3!="NA" && $4!="NA" && $5!="NA" && $6=="NA" && $7=="NA" && $8=="NA" {print $1, $2, $3, "\n" $1, $2, $4, "\n" $1, $2, $5 ","} $3!="NA" && $4!="NA" && $5!="NA" && $6!="NA" && $7=="NA" && $8=="NA" {print $1, $2, $3, "\n" $1, $2, $4, "\n" $1, $2, $5, "\n" $1, $2, $6 ","} $3!="NA" && $4!="NA" && $5!="NA" && $6!="NA" && $7!="NA" && $8=="NA" {print $1, $2, $3, "\n" $1, $2, $4, "\n" $1, $2, $5, "\n" $1, $2, $6, "\n" $1, $2, $7 ","}' OFS=, $DIRECT/cavr2008_2010_byEgo.csv > $DIRECT/cavr2008_2010_dyads.csv

  rm $DIRECT/cavr2008_2010_byEgo.csv

  # expand the blocked records by replacing the pid and descriptor separator (_) with a comma, replacing empty alter records with enough NAs to fill all alter cells, and replacing the descriptor separators (:) with commas
  sed -e 's/_/,/g' -e ':repeat; s/,NA,/,NA:NA:NA:NA:NA:NA:NA:NA:NA:NA:NA:NA:NA:NA:NA:NA:NA:NA:NA:NA:NA:NA:NA:NA:NA:NA:NA:NA:NA:NA:NA:NA:NA:NA:NA:NA:NA:NA:NA:NA:NA:NA:NA:NA:NA:NA:NA:NA:NA:NA:NA,/; t repeat' -e 's/:/,/g' -e 's/,$//g' $DIRECT/cavr2008_2010_dyads.csv | awk -F, '$54!="NA"' OFS=, > $DIRECT/cavr_expandeddyads.csv

  rm $DIRECT/cavr2008_2010_dyads.csv

## Calculate ego-alter variables
## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  ########################################################
  # calculate ego-alter variables and add to end of file #
  ########################################################


  # print dob_e and dob_a to separate file
  awk -F, '{print $9, $62}' OFS=, $DIRECT/cavr_expandeddyads.csv > $DIRECT/cavr2008_2010_eadyads_datacalc.csv

  set +x

  rm $DIRECT/cavr_dob_b3.csv

  # calculate age discrepancy by subtracting dob_a from dob_e and dividing by 365.25 to get the age discrepancy in years
  while IFS=, read -r dob_e dob_a; do

  ea_agediscrep=$((($dob_e-(($dob_a)))*4/1461))

  echo $ea_agediscrep >> $DIRECT/cavr_dob_b3.csv

  done < $DIRECT/cavr2008_2010_eadyads_datacalc.csv

  set -x

  # remove erroneous "\n" # add R-friendly ego-alter variables
  sed 's/\n//' $DIRECT/cavr_dob_b3.csv > $DIRECT/cavr2008_2010_eadata.csv

  rm $DIRECT/cavr_dob_b3.csv
  rm $DIRECT/cavr2008_2010_eadyads_datacalc.csv

  # add age discrepancy to end of ego alter dyad file
  paste -d\, $DIRECT/cavr_expandeddyads.csv $DIRECT/cavr2008_2010_eadata.csv > $DIRECT/cavr_eadyads1.csv

  rm $DIRECT/cavr_expandeddyads.csv
  rm $DIRECT/cavr2008_2010_eadata.csv

## Verification 2
## ~~~~~~~~~~~~~~~
  awk -F, '$14==0||$14==1||$14=="NA"' OFS=, $DIRECT/cavr_eadyads1.csv | awk -F, '$44==0||$44==1' OFS=, | awk -F, '$12=="M"||$12=="F"||$12=="NA"' OFS=, | awk -F, '$67==0||$67==1||$67=="NA"' OFS=, | awk -F, '$65=="M"||$65=="F"||$65=="NA"' OFS=, > $DIRECT/cavr_eadyads.csv

  awk -F, '$14!=0&&$14!=1&&$14!="NA"' OFS=, $DIRECT/cavr_eadyads1.csv | awk -F, '$44!=0&&$44!=1' OFS=, | awk -F, '$12!="M"&&$12!="F"&&$12!="NA"' OFS=, | awk -F, '$67!=0&&$67!=1&&$67!="NA"' OFS=, | awk -F, '$65!="M"&&$65!="F"&&$65!="NA"' OFS=, > $DIRECT/cavr_eadyads_removed.csv

  rm $DIRECT/cavr_eadyads1.csv


## House data (to remove deaths to another generation controls in the next section)
## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  awk -F, '{occnum[$2]++;}END{for (i in occnum) print i, occnum[i]}' OFS=, $DIRECT/cavr_egodata.csv > $DIRECT/cavr_housedata1.csv

  awk -F, '{deathinhid[$2]+=$52;}END{for (i in deathinhid) print i, deathinhid[i]}' OFS=, $DIRECT/cavr_egodata.csv | sed 's/$/,/g' > $DIRECT/cavr_housedata2.csv

  # awk -F, '$2>=1{$2=1}1' OFS=, $DIRECT/cavr_housedata2a.csv > $DIRECT/cavr_housedata2.csv

  rm $DIRECT/cavr_housedata2a.csv

  join -j1 -t, $DIRECT/cavr_housedata1.csv $DIRECT/cavr_housedata2.csv | sed 's/$/,/g' > $DIRECT/cavr_housedata.csv

  rm $DIRECT/cavr_housedata1.csv
  rm $DIRECT/cavr_housedata2.csv

## Create analysis file
## ~~~~~~~~~~~~~~~~~~~~~
  ###
  # add death in hid
  ###

  awk -F, 'FNR==NR{deathinhid[$1]=$3; next}($52 in deathinhid){$53=$53 "," deathinhid[$52]}1' OFS=, $DIRECT/cavr_housedata.csv $DIRECT/cavr_eadyads.csv > $DIRECT/cavr_eadyads1.csv

  ##########
  # sample #
  ##########

  # print ego-alive alter-alive; print ego-alive alter-deceased # print same generation only # print same last name; exclude ego-alive alter-alive dyads in w/ death households
  awk -F, '$51==0&&$105==0{print $0} $51==0&&$105==1{print $0}' OFS=, $DIRECT/cavr_eadyads1.csv | awk -F, '$106>=-15&&$106<=15' OFS=,  | awk -F, '$2==$56&&$54==$105' OFS=, > $DIRECT/cavr2009_2011_eadyads_samegenlast_alterdeathonly.csv

  rm $DIRECT/cavr_eadyads1.csv

  # two in generation
  awk -F, 'FNR==NR{a[$55]++; next}(a[$55]==1)' OFS=, $DIRECT/cavr2009_2011_eadyads_samegenlast_alterdeathonly.csv $DIRECT/cavr2009_2011_eadyads_samegenlast_alterdeathonly.csv > $DIRECT/cavr2009_2011_eadyads_samegenlast_alterdeathonly_sample.csv

  rm $DIRECT/cavr2009_2011_eadyads_samegenlast_alterdeathonly.csv

  awk -F, 'NR==1{print "pid_e,lastname_e,zip_e,state_e,localitycode_e,precinct_e,precinctpart_e,regdate_e,dob_e,dod_e,ageatdeath_e,gender_e,party_e,gg10_e,age_gg10_e,daysfromdeathgg10_e,gp10_e,age_gp10_e,daysfromdeathgp10_e,ss9_e,age_ss9_e,daysfromdeathss9_e,pg8_e,age_pg8_e,daysfromdeathpg8_e,dp8_e,age_dp8_e,daysfromdeathdp8_e,pp8_e,age_pp8_e,daysfromdeathpp8_e,gg6_e,age_gg6_e,daysfromdeathgg6_e,gp6_e,age_gp6_e,daysfromdeathgp6_e,ss5_e,age_ss5_e,daysfromdeathss5_e,pg4_e,age_pg4_e,daysfromdeathpg4_e,pp4_e,age_pp4_e,daysfromdeathpp4_e,male_e,female_e,dem_e,rep_e,death_e,hid,hid_occnum,deathinhid,pid_a,lastname_a,zip_a,state_a,localitycode_a,precinct_a,precinctpart_a,regdate_a,dob_a,dod_a,ageatdeath_a,gender_a,party_a,gg10_a,age_gg10_a,daysfromdeathgg10_a,gp10_a,age_gp10_a,daysfromdeathgp10_a,ss9_a,age_ss9_a,daysfromdeathss9_a,pg8_a,age_pg8_a,daysfromdeathpg8_a,dp8_a,age_dp8_a,daysfromdeathdp8_a,pp8_a,age_pp8_a,daysfromdeathpp8_a,gg6_a,age_gg6_a,daysfromdeathgg6_a,gp6_a,age_gp6_a,daysfromdeathgp6_a,ss5_a,age_ss5_a,daysfromdeathss5_a,pg4_a,age_pg4_a,daysfromdeathpg4_a,pp4_a,age_pp4_a,daysfromdeathpp4_a,male_a,female_a,dem_a,rep_a,death_a,ea_agediscrep"}' OFS=, $DIRECT/cavr_eadyads.csv | sed 's/daysfromdeath/dfd/g' > $DIRECT/cavr_fullheader.csv

  rm $DIRECT/cavr_eadyads.csv

  awk -F, 'NR==FNR{print $0; next}{print $0}' OFS=, $DIRECT/cavr_fullheader.csv $DIRECT/cavr2009_2011_eadyads_samegenlast_alterdeathonly_sample.csv | cut -d\, -f1-26,29,32,35,38,41,44,47-53,55-80,83,86,89,92,95,98,101-106 | awk -F, '$76==1&&$53=="NA"{$53=989898; $56=989898; $59=989898}1' OFS=, >  $DIRECT/cavr_analysisfile1.csv

  rm $DIRECT/cavr2009_2011_eadyads_samegenlast_alterdeathonly_sample.csv

  # add on zip code stats
  awk -F, 'NR==FNR{geodat[$1]=$2 "," $3 "," $4 "," $5 "," $6; next}{if ($42 in geodat) {print $0, geodat[$42];} else {print $0, "NA", "NA", "NA", "NA", "NA";}}' OFS=, $ZIPDIRECT/zipdat_forcavr.csv $DIRECT/cavr_analysisfile1.csv > $DIRECT/cavr_analysisfile1_withzipdat.csv

  rm $DIRECT/cavr_analysisfile1.csv

  # produce subset of cases only
  awk -F, '$76=="death_a"||$76==1' OFS=, $DIRECT/cavr_analysisfile1_withzipdat.csv > $DIRECT/cavr_analysisfile1_casesonly_withzipdat.csv

  # print stats
  awk -F, '$76==0' OFS=, $DIRECT/cavr_analysisfile1_withzipdat.csv | awk -F, 'END{print "Total number of controls: " NR "\n"}' OFS=, >> $DIRECT/cavr_format_stats.txt

  #^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  # $DIRECT/cavr_analysisfile1.csv:
  #
  # $1= pid_e                     $2= lastname_e
  # $3= zip_e                     $4= state_e                     $5= localitycode_e
  # $6= precinct_e        $7= precinctpart_e      $8= regdate_e
  #
  # $9= dob_e                     $10= dod_e                      $11= ageatdeath_e
  # $12= gender_e         $13= party_e
  #
  # $14= gg10_e           $15= age_gg10_e         $16= daysfromdeathgg10_e
  # $17= gp10_e           $18= age_gp10_e         $19= daysfromdeathgp10_e
  # $20= ss9_e            $21= age_ss9_e          $22= daysfromdeathss9_e
  # $23= pg8_e            $24= age_pg8_e          $25= daysfromdeathpg8_e
  #
  # $26= dp8_e            $27= pp8_e              $28= gg6_e              $29= gp6_e
  # $30= ss5_e            $31= pg4_e              $32= pp4_e
  #
  # $33= male_e           $34= female_e
  # $35= dem_e            $36= rep_e
  # $37= death_e
  #
  # $38= hid                      $39= hid_occnum
  #
  # $40= pid_a            $41= lastname_a
  # $42= zip_a            $43= state_a            $44= localitycode_a
  # $45= precinct_a       $46= precinctpart_a     $47= regdate_a
  #
  # $48= dob_a            $49= dod_a                      $50= ageatdeath_a
  # $51= gender_a         $52= party_a
  #
  # $53= gg10_a           $54= age_gg10_a         $55= daysfromdeathgg10_a
  # $56= gp10_a           $57= age_gp10_a         $58= daysfromdeathgp10_a
  # $59= ss9_a            $60= age_ss9_a          $61= daysfromdeathss9_a
  # $62= pg8_a            $63= age_pg8_a          $64= daysfromdeathpg8_a
  #
  # $65= dp8_a            $66= pp8_a              $67= gg6_a              $68= gp6_a
  # $69= ss5_a            $70= pg4_a              $71= pp4_a
  #
  # $72= male_a           $73= female_a
  # $74= dem_a            $75= rep_a
  # $76= death_a
  #
  # $77-$82 zip data
  #
  #^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

## Add movement data
## ~~~~~~~~~~~~~~~~~~
  #### this script again formats cavr 2010 and adds the movement data to the final analysis file



    # convert tsv to csv # remove any pipe characters to allow for use as alternate delimiter
    sed -e 's/\t/,/g' -e 's/|//g' "$CAVR2010" > $DIRECT/cavr2010_unformatted.csv


    #########################################################
    # extract identification fields and format for matching #
    #########################################################

    # print last name and first name # remove spaces and replace empty fields with NA (not applicable but still kept in the code)
    awk -F, 'NR!=1 {print "|"$3"|"$4"|","|"$15"|"$10"|"$9"|"$12"|"$7"|"$8"|"$14"|";}' OFS=',' $DIRECT/cavr2010_unformatted.csv | sed -e 's/ //g' -e ':repeat; s/,,/,NA,/; t repeat' > $DIRECT/cavr_id.csv


    # print formatted date (yyyymmdd) to cavr_dob.csv
    awk -F, 'NR!=1 {print $27}' OFS=',' $DIRECT/cavr2010_unformatted.csv | sed 's/^\(..\)\/\(..\)\/\(....\)$/\3\1\2/' > $DIRECT/cavr_dob.csv

  rm $DIRECT/cavr2010_unformatted.csv

    # combine cavr_id.csv and cavr_dob.csv
    paste -d\, $DIRECT/cavr_id.csv $DIRECT/cavr_dob.csv > $DIRECT/cavr_pid1.csv

    rm $DIRECT/cavr_id.csv
    rm $DIRECT/cavr_dob.csv


    # add Gregorian DOB to PID
    awk -F, '{print "|"$3 $1, $2;}' OFS=, $DIRECT/cavr_pid1.csv > $DIRECT/cavr_pid.csv

    rm $DIRECT/cavr_pid1.csv

    # change all characters to uppercase and remove _ : to allow use of alternate delimiters
    tr '[:lower:]' '[:upper:]' < $DIRECT/cavr_pid.csv | sed -e 's/_//g' -e 's/://g'  > $DIRECT/cavr2010_ids.csv

    rm $DIRECT/cavr_pid.csv

  awk -F, 'FNR==NR{a[$1]=$2; next}{if ($1 in a) {print $0, a[$1];} else if (FNR==1) {print $0, "hid_e_2010"} else {print $0, "NA";}}' OFS=, $DIRECT/cavr2010_ids.csv $DIRECT/cavr_analysisfile1_withzipdat.csv | awk -F, 'NR==1{print $0, "mover_e"}NR!=1{print $0, $38!=$83}' OFS=, > $DIRECT/cavr_analysisfile2.csv

  rm $DIRECT/cavr_analysisfile1_withzipdat.csv

  awk -F, 'FNR==NR{a[$1]=$2; next}{if ($40 in a) {print $0, a[$40];} else if (FNR==1) {print $0, "hid_a_2010"} else {print $0, "NA";}}' OFS=, $DIRECT/cavr2010_ids.csv $DIRECT/cavr_analysisfile2.csv | awk -F, 'NR==1{print $0, "mover_a"}NR!=1{print $0, $38!=$83}' OFS=, > $DIRECT/cavr_analysisfile3.csv

  rm $DIRECT/cavr2010_ids.csv
  rm $DIRECT/cavr_analysisfile2.csv

  mv $DIRECT/cavr_analysisfile3.csv $DIRECT/cavr_analysisfile.csv

  ### to record movements and county removal rates for deaths 30 to 563 days prior to the Gubernatorial General Election 2010
  awk -F, '{print $14, $15 $33, $34, $35, $36, $39, $44, $53, $55, $72, $73, $74, $75, $76, $78, $81, $84, $86}' OFS=, cavr_analysisfile.csv > cavr_rec.csv


## Add registration data
## ~~~~~~~~~~~~~~~~~~~~~~
  sed -e 's/\t/,/g' -e 's/|//g' -e 's/ //g' "$CAVR2009" > $DIRECT/cavr2009_unformatted.csv

  awk -F, 'NR!=1 {print "|"$3"|"$4"|", $35, $37, $26;}' OFS=',' $DIRECT/cavr2009_unformatted.csv > $DIRECT/cavr_id.csv

  # print formatted date (yyyymmdd) to cavr_dob.csv
  awk -F, 'NR!=1 {print $27}' OFS=',' $DIRECT/cavr2009_unformatted.csv | sed 's/^\(..\)\/\(..\)\/\(....\)$/\3\1\2/' > $DIRECT/cavr_dob.csv

  rm $DIRECT/cavr2009_unformatted.csv

  # combine cavr_id.csv and cavr_dob.csv
  paste -d\, $DIRECT/cavr_id.csv $DIRECT/cavr_dob.csv > $DIRECT/cavr_pid1.csv

  rm $DIRECT/cavr_id.csv
  rm $DIRECT/cavr_dob.csv

  # add Gregorian DOB to PID
  awk -F, '{print "|"$5 $1, $2, $3, $4;}' OFS=, $DIRECT/cavr_pid1.csv > $DIRECT/cavr_pid.csv

  rm $DIRECT/cavr_pid1.csv

  # change all characters to uppercase and remove _ : to allow use of alternate delimiters
  tr '[:lower:]' '[:upper:]' < $DIRECT/cavr_pid.csv | sed -e 's/_//g' -e 's/://g'  > $DIRECT/cavr2009_ids.csv

  rm $DIRECT/cavr_pid.csv

  awk -F, 'FNR==NR{a[$1]=$2 "," $3 "," $4; next}{if ($1 in a) {print $0, a[$1];} else if (FNR==1) {print $0, "registration", "birthplace", "language"} else {print $0, "NA", "NA", "NA";}}' OFS=, $DIRECT/cavr2009_ids.csv $DIRECT/cavr_analysisfile.csv >  $DIRECT/cavr_analysisfile_withregdat.csv

